Loading necessary packages.
Loading data.
Looking at data:
dim(noc_regions)
## [1] 230 3
summary(athlete_events)
## ID Name Sex Age
## Min. : 1 Length:271116 Length:271116 Min. :10.00
## 1st Qu.: 34643 Class :character Class :character 1st Qu.:21.00
## Median : 68205 Mode :character Mode :character Median :24.00
## Mean : 68249 Mean :25.56
## 3rd Qu.:102097 3rd Qu.:28.00
## Max. :135571 Max. :97.00
## NA's :9474
## Height Weight Team NOC
## Min. :127.0 Min. : 25.0 Length:271116 Length:271116
## 1st Qu.:168.0 1st Qu.: 60.0 Class :character Class :character
## Median :175.0 Median : 70.0 Mode :character Mode :character
## Mean :175.3 Mean : 70.7
## 3rd Qu.:183.0 3rd Qu.: 79.0
## Max. :226.0 Max. :214.0
## NA's :60171 NA's :62875
## Games Year Season City
## Length:271116 Min. :1896 Length:271116 Length:271116
## Class :character 1st Qu.:1960 Class :character Class :character
## Mode :character Median :1988 Mode :character Mode :character
## Mean :1978
## 3rd Qu.:2002
## Max. :2016
##
## Sport Event Medal
## Length:271116 Length:271116 Length:271116
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
Integer encoding the NOC data for possible later usage:
NOC_Int <- c(as.numeric(as.factor(noc_regions$NOC)))
noc_regions$NOC.Int <- NOC_Int
noc_regions
Looking at oldest and youngest athletes, finding athletes with NA age:
#ages <- athlete_events$Age
#max(athlete_events$Age, na.rm = TRUE)
oldAge <- subset(athlete_events, Age == max(athlete_events$Age, na.rm = TRUE))
youngAge <- subset(athlete_events, Age == min(athlete_events$Age, na.rm = TRUE))
list(oldAge,youngAge)
## [[1]]
## ID Name Sex Age Height Weight Team
## 257055 128719 John Quincy Adams Ward M 97 NA NA United States
## NOC Games Year Season City Sport
## 257055 USA 1928 Summer 1928 Summer Amsterdam Art Competitions
## Event Medal
## 257055 Art Competitions Mixed Sculpturing, Statues <NA>
##
## [[2]]
## ID Name Sex Age Height Weight
## 142883 71691 Dimitrios Loundras M 10 NA NA
## Team NOC Games Year Season City
## 142883 Ethnikos Gymnastikos Syllogos GRE 1896 Summer 1896 Summer Athina
## Sport Event Medal
## 142883 Gymnastics Gymnastics Men's Parallel Bars, Teams Bronze
shyAthletes <- subset(athlete_events, is.na(Age))
head(shyAthletes)
Counts of certain columns:
loadPkg("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
##
## first, last
## The following object is masked from 'package:GGally':
##
## nasa
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
sexC <- athlete_events %>% count(Sex)
ageC <- athlete_events %>% count(Age)
nameC <- athlete_events %>% count(Name)
medalC <- athlete_events %>% count(Medal)
medalC
Barchart of medal data:
medalDat <- subset(athlete_events, !is.na(Medal))
bp1 <- ggplot(athlete_events, aes(x=Medal), color = Team) + geom_bar(aes(y=(..count..)/sum(..count..)))
bp2 <- ggplot(medalDat, aes(x=Medal), color = Team) + geom_bar(aes(y=(..count..)/sum(..count..)))
grid.arrange(bp1, bp2, nrow = 1)
Integer encoding data so that it can be include in checking correlation:
mDat <- medalDat %>% mutate(Medal = replace(Medal, Medal == "Gold", 1))
mDat <- mDat %>% mutate(Medal = replace(Medal, Medal == "Silver", 2))
mDat <- mDat %>% mutate(Medal = replace(Medal, Medal == "Bronze", 3))
mDat <- mDat %>% mutate(Sex = replace(Sex, Sex == "F", 1))
mDat <- mDat %>% mutate(Sex = replace(Sex, Sex == "M", 2))
mNOCi <- c(as.numeric(as.factor(mDat$NOC)))
mSPi <- c(as.numeric(as.factor(mDat$Sport)))
mDat$NOC.Int <- mNOCi
mDat$Sport.Int <- mSPi
head(mDat)
Columns that I want to keep for correlation plot:
keeps <- c("ID","Sex","Age","Height","Weight","NOC.Int","Sport.Int","Medal")
dmDat <- mDat[ , keeps, drop = FALSE]
dmDat[,] <- sapply(dmDat[,], as.numeric)
head(dmDat)
Correlation plot:
loadPkg("digest")
corr = cor(dmDat, use="pairwise.complete.obs")
c1 <- corrplot.mixed(corr, title="Correlation Matrix", mar=c(0,0,1,0))
c2 <- corrplot(corr, method="pie", type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)
#h1 <- heatmap(x=corr, symm=TRUE)
p.mat <- cor_pmat(dmDat)
gcorr1 <- ggcorrplot(corr, method = "circle", hc.order = TRUE)
gcorr2 <- ggcorrplot(corr, method = "square", hc.order = TRUE)
grid.arrange(gcorr1, gcorr2, nrow = 1)
Essentially very similar to above, just with package PerformanceAnalytics:
chart.Correlation(dmDat, histogram=TRUE, pch=19)
In the above plot: * The distribution of each variable is shown on the diagonal. * On the bottom of the diagonal : the bivariate scatter plots with a fitted line are displayed * On the top of the diagonal : the value of the correlation plus the significance level as stars * Each significance level is associated to a symbol : p-values(0, 0.001, 0.01, 0.05, 0.1, 1) <=> symbols(“***”, “**”, “*”, “.”, " “)
Starting to look at names for analysis:
loadPkg("tidyverse")
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.3.2
## v tibble 2.1.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::combine() masks gridExtra::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks xts::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks xts::last()
loadPkg("rvest")
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
loadPkg("magrittr")
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
loadPkg("ggmap")
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:magrittr':
##
## inset
first.inst.NOC <- first.instance %>% count(NOC)
first.inst.NOC <- first.inst.NOC[order(first.inst.NOC$NOC),]
names(first.inst.NOC)[names(first.inst.NOC) == "n"] <- "rank"
first.inst.NOC$country = noc_regions$region
first.inst.NOC
map.world <- map_data("world")
map.world_joined <- left_join(map.world, first.inst.NOC, by = c('region' = 'country'))
map.world_joined <- map.world_joined %>% mutate(fill_flg = ifelse(is.na(rank),F,T))
register_google(key = "AIzaSyCv1WHHXGBFuIRJBQ3G-5Ijuyft_8sKeHc")
df.country_points <- data.frame(country = c("Singapore","luxembourg"),stringsAsFactors = F)
glimpse(df.country_points)
## Observations: 2
## Variables: 1
## $ country <chr> "Singapore", "luxembourg"
geocode.country_points <- geocode(df.country_points$country)
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Singapore&key=xxx-5Ijuyft_8sKeHc
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=luxembourg&key=xxx-5Ijuyft_8sKeHc
## "luxembourg" not uniquely geocoded, using "luxembourg"
df.country_points <- cbind(df.country_points,geocode.country_points)
map.world_joined
ggplot() +
geom_polygon( data=map.world_joined, aes(x=long, y=lat, group=group),
color="black", fill="lightblue" )
w <- ggplot()
w <- w + geom_polygon( data=map.world_joined,
aes(x=long, y=lat, group=group, fill = rank),
color="white", size = 0.2)
w
w <- w + scale_fill_continuous(name="Number of Olympic Athletes",
low = "lightblue", high = "darkblue",limits = c(0,9700),
breaks=c(970,1940,2910,3880,4850,5820,6790,7760,8730), na.value = "white") +
labs(title="Olympic Athletes from Around the Globe")
w
wordcloud(words = first.names.count$First.Name, freq = first.names.count$n, min.freq = 190, max.words=100, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Paired"))
wordcloud(words = lastname.edit$Last.Name, freq = lastname.edit$n, min.freq = 50, max.words=100, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Paired"))
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Robinson could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Lewis could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Costa could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Khan could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Ramrez could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Nagy could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Clark could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Fischer could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Wei could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Evans could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Roberts could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Souza could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Gutirrez could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Meyer could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Weber could not be fit on page. It will not be plotted.
val2 = which.max(nchar(athlete_events$Name))
athlete_events[val2,]
Looking at height/weight info:
loadPkg("plyr")
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
##
## compact
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
weightDat <- subset(athlete_events, !is.na(Weight))
heightDat <- subset(athlete_events, !is.na(Height))
muW <- ddply(weightDat, "Sex", summarise, grp.mean=mean(Weight), grp.variance=var(Weight), grp.standarddev=sd(Weight))
muH <- ddply(heightDat, "Sex", summarise, grp.mean=mean(Height), grp.variance=var(Height), grp.standarddev=sd(Height))
muW
muH
Standard histograms forheight/weight:
p1 <- ggplot(weightDat, aes(x=Weight, color=Sex, fill=Sex)) + geom_histogram(alpha=0.5, binwidth = 5, aes(y=..density..), position="dodge") + geom_density(alpha=0.6) + geom_vline(data=muW, aes(xintercept=grp.mean, color=Sex), linetype="dashed") +
labs(title="Weight histogram plot",x="Weight(kg)", y = "Density")
p1c <- p1 + scale_color_brewer(palette="Accent")+ scale_fill_brewer(palette="Accent") + theme_minimal()+theme(legend.position="top")
p2 <- ggplot(heightDat, aes(x=Height, color=Sex, fill=Sex)) + geom_histogram(binwidth = 5, alpha=0.5, aes(y=..density..), position="dodge") + geom_density(alpha=0.6) + geom_vline(data=muH, aes(xintercept=grp.mean, color=Sex), linetype="dashed") +
labs(title="Height histogram plot",x="Height(cm)", y = "Density")
p2c <- p2 + scale_color_brewer(palette="Accent")+ scale_fill_brewer(palette="Accent") +
theme_minimal()+theme(legend.position="top")
grid.arrange(p1c, p2c, nrow = 2)
weightDat
outlierKD2 <- function(df, var, rm=FALSE, title, colVal) {
#' Original outlierKD functino by By Klodian Dhana,
#' https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/
#' Modified to have third argument for removing outliers inwtead of interactive prompt,
#' and after removing outlier, original df will not be changed. The function returns the new df,
#' which can be saved as original df name if desired.
#' Check outliers, and option to remove them, save as a new dataframe.
#' @param df The dataframe.
#' @param var The variable in the dataframe to be checked for outliers
#' @param rm Boolean. Whether to remove outliers or not.
#' @return The dataframe with outliers replaced by NA if rm==TRUE, or df if nothing changed
#' @examples
#' outlierKD2(mydf, height, FALSE)
#' mydf = outlierKD2(mydf, height, TRUE)
#' mydfnew = outlierKD2(mydf, height, TRUE)
dt = df # duplicate the dataframe for potential alteration
var_name <- eval(substitute(var),eval(dt))
na1 <- sum(is.na(var_name))
m1 <- mean(var_name, na.rm = T)
par(mfrow=c(2, 3), oma=c(0,0,3,0))
boxplot(var_name, main=paste("With outliers for", title), xlab = title, col = colVal)
hist(var_name, main=paste("With outliers for", title), xlab=title, ylab="frequency", col = colVal, density = 10)
qqnorm(var_name, main=paste("Q-Q plot with outliers for", title), col = colVal)
qqline(var_name)
outlier <- boxplot.stats(var_name)$out
mo <- mean(outlier)
var_name <- ifelse(var_name %in% outlier, NA, var_name)
boxplot(var_name, main=paste("Without outliers for", title), xlab = title, col = colVal)
hist(var_name, main=paste("Without outliers for", title), xlab=title, ylab="frequency", col = colVal, density = 10)
qqnorm(var_name, main=paste("Q-Q plot without outliers for", title), col = colVal)
qqline(var_name)
title(paste("Outlier Check for ", title), outer=TRUE)
na2 <- sum(is.na(var_name))
cat("Outliers identified:", na2 - na1, "\n")
cat("Propotion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "\n")
cat("Mean of the outliers:", round(mo, 2), "\n")
m2 <- mean(var_name, na.rm = T)
cat("Mean without removing outliers:", round(m1, 2), "\n")
cat("Mean if we remove outliers:", round(m2, 2), "\n")
# response <- readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
# if(response == "y" | response == "yes"){
if(rm){
dt[as.character(substitute(var))] <- invisible(var_name)
#assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
cat("Outliers successfully removed", "\n")
return(invisible(dt))
} else {
cat("Nothing changed", "\n")
return(invisible(df))
}
}
weightDat.noOut = outlierKD2(weightDat, Weight, TRUE, "weight (kg)", "red")
## Outliers identified: 3237
## Propotion (%) of outliers: 1.6
## Mean of the outliers: 118.11
## Mean without removing outliers: 70.7
## Mean if we remove outliers: 69.95
## Outliers successfully removed
heightDat.noOut = outlierKD2(heightDat, Height, TRUE, "height (cm)", "blue")
## Outliers identified: 1338
## Propotion (%) of outliers: 0.6
## Mean of the outliers: 174.08
## Mean without removing outliers: 175.34
## Mean if we remove outliers: 175.35
## Outliers successfully removed
g1 <- ggplot(weightDat, aes(x=Sex, y=Weight, color = Sex)) + geom_boxplot() + geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=1) + labs(title="Weight Boxplot of Olympic Athletes\nWith Outliers", x="sex", y="weight (kg)")
g2 <- ggplot(weightDat) + geom_qq(aes(sample = Weight), color = "red") + geom_qq_line(aes(sample = Weight)) + theme_minimal() + labs(title="Q-Q Plot of Weight of Olympic Athletes\nWith Outliers")
g3 <- ggplot(weightDat.noOut, aes(x=Sex, y=Weight, color = Sex)) + geom_boxplot() + geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=1) + labs(title="Weight Boxplot of Olympic Athletes\nWithout Outliers", x="sex", y="weight (kg)")
g4 <- ggplot(weightDat.noOut) + geom_qq(aes(sample = Weight), color = "red") + geom_qq_line(aes(sample = Weight)) + theme_minimal() + labs(title="Q-Q Plot of Weight of Olympic Athletes\nWithout Outliers")
g5 <- ggplot(heightDat, aes(x=Sex, y=Height, color = Sex)) + geom_boxplot() + geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=1) + labs(title="Height Boxplot of Olympic Athletes\nWith Outliers", x="sex", y="height (cm)")
g6 <- ggplot(weightDat) + geom_qq(aes(sample = Height), color = "red") + geom_qq_line(aes(sample = Height)) + theme_minimal() + labs(title="Q-Q Plot of Height of Olympic Athletes\nWith Outliers")
g7 <- ggplot(heightDat.noOut, aes(x=Sex, y=Height, color = Sex)) + geom_boxplot() + geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=1) + labs(title="Height Boxplot of Olympic Athletes \nWithout Outliers", x="sex", y="height (cm)")
g8 <- ggplot(heightDat.noOut) + geom_qq(aes(sample = Height), color = "red") + geom_qq_line(aes(sample = Height)) + theme_minimal() + labs(title="Q-Q Plot of Height of Olympic Athletes \nWithout Outliers")
grid.arrange(g1,g2,g3,g4, ncol = 2, nrow = 2)
## Warning: Removed 3237 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3237 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3237 rows containing non-finite values (stat_qq).
## Warning: Removed 3237 rows containing non-finite values (stat_qq_line).
grid.arrange(g5,g6,g7,g8, ncol = 2, nrow = 2)
## Warning: Removed 1388 rows containing non-finite values (stat_qq).
## Warning: Removed 1388 rows containing non-finite values (stat_qq_line).
## Warning: Removed 1338 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1338 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1338 rows containing non-finite values (stat_qq).
## Warning: Removed 1338 rows containing non-finite values (stat_qq_line).
comment comment comment